Example 1 uses tidymodels with data about penguins from the Palmer Archipelago in Antarctica. The data include measurements on three different species of penguins. This example reduces the outcome to two classes and uses 10-fold cross-validation to tune and compare models before fitting one final model.
library(tidyverse)
library(tidymodels)
library(palmerpenguins)
# Drop to two species: keep Adelie and Gentoo as-is, and randomly
# reassign each Chinstrap penguin to one of the two remaining classes
# with probability 1/2 each.
penguins_small <-
  bind_cols(
    penguins,
    random = runif(nrow(penguins))
  ) %>%
  mutate(
    species =
      case_when(
        species == "Adelie" ~ "Adelie",
        species == "Gentoo" ~ "Gentoo",
        species == "Chinstrap" & random < 0.5 ~ "Adelie",
        # catch-all for the remaining Chinstrap rows; the original
        # `random > 0.5` condition left random == 0.5 unmatched,
        # which would silently produce an NA species
        species == "Chinstrap" ~ "Gentoo"
      )
  ) %>%
  mutate(species = factor(species)) %>%
  select(-random)
# count the missing values in every column
penguins_small %>%
  map_dbl(~ sum(is.na(.x)))
## species island bill_length_mm bill_depth_mm
## 0 0 2 2
## flipper_length_mm body_mass_g sex year
## 2 2 11 0
# keep only the rows with no missing values
penguins_small <- drop_na(penguins_small)
# seed so the train/test split is reproducible
set.seed(20201013)

# hold out 20% of the rows for testing
penguins_small_split <- initial_split(penguins_small, prop = 0.8)

# materialize the training and testing sets
penguins_small_train <- training(penguins_small_split)
penguins_small_test <- testing(penguins_small_split)

# the combined data set is no longer needed
rm(penguins_small)
# scatterplot of the training data, colored by species
ggplot(
  data = penguins_small_train,
  mapping = aes(x = flipper_length_mm, y = bill_length_mm, color = species)
) +
  geom_point() +
  theme_minimal()
# seed so the fold assignment is reproducible
set.seed(20201217)

# 10-fold cross-validation on the training data
folds <- vfold_cv(penguins_small_train, v = 10)

# rescale the four numeric measurements to [0, 1] so no single
# variable dominates the KNN distance calculation
knn_recipe <- recipe(species ~ ., data = penguins_small_train) %>%
  step_range(bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g)
# KNN classifier with the number of neighbors left to be tuned
knn_mod <- nearest_neighbor(neighbors = tune()) %>%
  set_mode("classification") %>%
  set_engine("kknn")

# bundle the preprocessing recipe and the model specification
knn_workflow <- workflow() %>%
  add_recipe(knn_recipe) %>%
  add_model(knn_mod)
# candidate values of k: the odd numbers from 1 to 15
knn_grid <- tibble(neighbors = seq(1, 15, by = 2))

# estimate the workflow on every fold for every value of k
knn_res <- tune_grid(
  knn_workflow,
  resamples = folds,
  grid = knn_grid
)

# average performance across the 10 folds for each value of k
collect_metrics(knn_res)
## # A tibble: 16 x 7
## neighbors .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <fct>
## 1 1 accuracy binary 0.899 10 0.0207 Preprocessor1_Model1
## 2 1 roc_auc binary 0.907 10 0.0203 Preprocessor1_Model1
## 3 3 accuracy binary 0.899 10 0.0207 Preprocessor1_Model2
## 4 3 roc_auc binary 0.951 10 0.0195 Preprocessor1_Model2
## 5 5 accuracy binary 0.907 10 0.0215 Preprocessor1_Model3
## 6 5 roc_auc binary 0.969 10 0.0180 Preprocessor1_Model3
## 7 7 accuracy binary 0.907 10 0.0215 Preprocessor1_Model4
## 8 7 roc_auc binary 0.972 10 0.0163 Preprocessor1_Model4
## 9 9 accuracy binary 0.903 10 0.0223 Preprocessor1_Model5
## 10 9 roc_auc binary 0.971 10 0.0163 Preprocessor1_Model5
## 11 11 accuracy binary 0.910 10 0.0185 Preprocessor1_Model6
## 12 11 roc_auc binary 0.976 10 0.0144 Preprocessor1_Model6
## 13 13 accuracy binary 0.910 10 0.0185 Preprocessor1_Model7
## 14 13 roc_auc binary 0.976 10 0.0145 Preprocessor1_Model7
## 15 15 accuracy binary 0.907 10 0.0185 Preprocessor1_Model8
## 16 15 roc_auc binary 0.976 10 0.0145 Preprocessor1_Model8
We’re going to repeat exercise 1 with a CART model instead of KNN.
# create a CART model object (no tuning parameters)
cart_mod <-
  decision_tree() %>%
  set_engine(engine = "rpart") %>%
  set_mode(mode = "classification")

# reuse the same preprocessing recipe as the KNN workflow
cart_workflow <-
  workflow() %>%
  add_model(spec = cart_mod) %>%
  add_recipe(recipe = knn_recipe)

# cart_mod has no tune() placeholders, so there is nothing to tune:
# fit_resamples() estimates the single model on each fold directly.
# (The original tune_grid() call passed knn_grid, a grid of `neighbors`
# values that a decision tree does not have — only one model was fit.)
cart_res <-
  cart_workflow %>%
  fit_resamples(resamples = folds)

cart_res %>%
  collect_metrics()
## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <fct>
## 1 accuracy binary 0.870 10 0.0281 Preprocessor1_Model1
## 2 roc_auc binary 0.937 10 0.0196 Preprocessor1_Model1
We compared CART and KNN across the resamples. Let’s estimate CART on all of the training data.
# fit the CART model on all of the training data
# NOTE(review): this fits cart_mod directly, without the step_range
# recipe used during resampling — confirm rescaling is not needed
# for rpart before relying on these predictions
final_mod <- cart_mod %>%
  fit(formula = species ~ ., data = penguins_small_train)

# predicted class and predicted probability of each class
class_preds <- predict(final_mod, new_data = penguins_small_test)
prob_preds <- predict(final_mod, new_data = penguins_small_test, type = "prob")
predictions <- bind_cols(penguins_small_test, class_preds, prob_preds)

# peek at ten predictions next to the observed species
predictions %>%
  select(species, starts_with(".pred")) %>%
  sample_n(size = 10)
## # A tibble: 10 x 4
## species .pred_class .pred_Adelie .pred_Gentoo
## <fct> <fct> <dbl> <dbl>
## 1 Adelie Adelie 0.981 0.0185
## 2 Gentoo Gentoo 0.0106 0.989
## 3 Gentoo Gentoo 0.0106 0.989
## 4 Gentoo Adelie 0.981 0.0185
## 5 Gentoo Adelie 0.655 0.345
## 6 Gentoo Gentoo 0.0106 0.989
## 7 Gentoo Gentoo 0.0106 0.989
## 8 Adelie Adelie 0.981 0.0185
## 9 Gentoo Gentoo 0.0106 0.989
## 10 Adelie Adelie 0.981 0.0185
# visualize the fitted tree using the underlying rpart object
rpart.plot::rpart.plot(x = final_mod$fit)
Create a confusion matrix:
# cross-tabulate predicted against observed species
predictions %>%
  conf_mat(truth = species, estimate = .pred_class)
## Truth
## Prediction Adelie Gentoo
## Adelie 31 5
## Gentoo 5 25
“Adelie” is the “event”.
\[Accuracy = \frac{TP + TN}{total} = \frac{31 + 25}{66} = \frac{56}{66} \approx 0.848\]
# proportion of test observations classified correctly
predictions %>%
  accuracy(truth = species, estimate = .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.848
\[Precision = \frac{TP}{TP + FP} = \frac{31}{31 + 5} = \frac{31}{36} \approx 0.861\]
# of the observations predicted to be the event class, the
# proportion that truly are
predictions %>%
  precision(truth = species, estimate = .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 precision binary 0.861
\[Sensitivity = \frac{TP}{TP + FN} = \frac{31}{31 + 5} = \frac{31}{36} \approx 0.861\]
# of the observations that truly are the event class, the
# proportion correctly identified
predictions %>%
  recall(truth = species, estimate = .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 recall binary 0.861
Photo by: Lescroël, A. L.; Ballard, G.; Grémillet, D.; Authier, M.; Ainley, D. G. (2014)
# a hypothetical new penguin to classify
new_penguins <- tibble(
  island = "Torgersen",
  bill_length_mm = 40,
  bill_depth_mm = 19,
  flipper_length_mm = 190,
  body_mass_g = 4000,
  sex = "male",
  year = 2008
)

# predicted class for the new observation
predict(final_mod, new_data = new_penguins)
## # A tibble: 1 x 1
## .pred_class
## <fct>
## 1 Adelie

# predicted class probabilities for the new observation
predict(final_mod, new_data = new_penguins, type = "prob")
## # A tibble: 1 x 2
## .pred_Adelie .pred_Gentoo
## <dbl> <dbl>
## 1 0.981 0.0185